# -*- coding: utf-8 -*-
# Python 2.7

# Script for extracting numbers and their surrounding context from Wikipedia,
# storing each occurrence as a separate document.

from pypatnlp import *
from random import sample

etwiki = PyCorpus('data/etwiki.pycorp', readonly=True)
keys = sample(list(etwiki.keys()), 1000)
etwiki_nums = PyCorpus('data/etwiki_num_tmp.pycorp')
etwiki_nums.autocommit(False)

for doc_id in keys:
    print 'Processing ', doc_id
    doc = etwiki[doc_id]
    words = list(doc.word)
    cover = regex_doc_cover(doc, 'word', '[0-9]*')
    for i in cover.indices():
        etwiki_nums[unicode(doc_id.decode('utf-8')) + u'_' + unicode(i)] = doc.ix[i-3:i+4]
        
etwiki_nums.commit()
etwiki_nums.close()

as_t3corpus('data/etwiki_num_tmp.pycorp', 'data/etwiki_num.pycorp')
